library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
library(rworldmap)
As COVID-19 spreads at an alarming rate, a pressing question at a global scale emerges– what factors of a country contribute to the spread of Coronavirus. We hope to analyze the relationship between a country’s population level, population density, and continent categorization on the spread of COVID-19.
COVID <- read.csv(file = "total-covid-cases-deaths-per-million.csv")
COVID
COVID %>%
nrow()
[1] 9487
COVID %>%
names()
[1] "total.covid.cases.deaths.per.million" "X"
[3] "X.1" "X.2"
[5] "X.3" "X.4"
[7] "X.5" "X.6"
[9] "X.7" "X.8"
[11] "X.9" "X.10"
[13] "X.11" "X.12"
[15] "X.13" "X.14"
[17] "X.15" "X.16"
[19] "X.17" "X.18"
[21] "X.19" "X.20"
[23] "X.21" "X.22"
[25] "X.23" "X.24"
[27] "X.25" "X.26"
[29] "X.27" "X.28"
[31] "X.29" "X.30"
[33] "X.31" "X.32"
[35] "X.33" "X.34"
[37] "X.35" "X.36"
[39] "X.37" "X.38"
[41] "X.39" "X.40"
[43] "X.41" "X.42"
[45] "X.43" "X.44"
[47] "X.45" "X.46"
[49] "X.47" "X.48"
[51] "X.49" "X.50"
[53] "X.51" "X.52"
[55] "X.53" "X.54"
[57] "X.55" "X.56"
[59] "X.57" "X.58"
[61] "X.59" "X.60"
[63] "X.61" "X.62"
[65] "X.63" "X.64"
[67] "X.65" "X.66"
[69] "X.67" "X.68"
[71] "X.69" "X.70"
[73] "X.71" "X.72"
[75] "X.73" "X.74"
[77] "X.75" "X.76"
[79] "X.77" "X.78"
[81] "X.79" "X.80"
[83] "X.81" "X.82"
[85] "X.83" "X.84"
[87] "X.85" "X.86"
[89] "X.87" "X.88"
[91] "X.89" "X.90"
[93] "X.91" "X.92"
[95] "X.93" "X.94"
[97] "X.95" "X.96"
[99] "X.97" "X.98"
[101] "X.99" "X.100"
[103] "X.101" "X.102"
[105] "X.103" "X.104"
[107] "X.105" "X.106"
[109] "X.107" "X.108"
[111] "X.109" "X.110"
[113] "X.111" "X.112"
[115] "X.113" "X.114"
[117] "X.115" "X.116"
[119] "X.117" "X.118"
[121] "X.119" "X.120"
[123] "X.121" "X.122"
[125] "X.123" "X.124"
[127] "X.125" "X.126"
[129] "X.127" "X.128"
[131] "X.129" "X.130"
[133] "X.131" "X.132"
[135] "X.133" "X.134"
[137] "X.135" "X.136"
[139] "X.137" "X.138"
[141] "X.139" "X.140"
[143] "X.141" "X.142"
[145] "X.143" "X.144"
[147] "X.145" "X.146"
[149] "X.147" "X.148"
[151] "X.149" "X.150"
[153] "X.151" "X.152"
[155] "X.153" "X.154"
[157] "X.155" "X.156"
[159] "X.157" "X.158"
[161] "X.159" "X.160"
[163] "X.161" "X.162"
[165] "X.163" "X.164"
[167] "X.165" "X.166"
[169] "X.167" "X.168"
[171] "X.169" "X.170"
[173] "X.171" "X.172"
[175] "X.173" "X.174"
[177] "X.175" "X.176"
[179] "X.177" "X.178"
[181] "X.179" "X.180"
[183] "X.181" "X.182"
[185] "X.183" "X.184"
[187] "X.185" "X.186"
[189] "X.187" "X.188"
[191] "X.189" "X.190"
[193] "X.191" "X.192"
[195] "X.193" "X.194"
[197] "X.195" "X.196"
[199] "X.197" "X.198"
[201] "X.199" "X.200"
[203] "X.201" "X.202"
[205] "X.203" "X.204"
[207] "X.205" "X.206"
[209] "X.207" "X.208"
[211] "X.209" "X.210"
[213] "X.211" "X.212"
[215] "X.213" "X.214"
[217] "X.215" "X.216"
[219] "X.217" "X.218"
[221] "X.219" "X.220"
[223] "X.221" "X.222"
[225] "X.223" "X.224"
[227] "X.225" "X.226"
[229] "X.227" "X.228"
[231] "X.229" "X.230"
[233] "X.231" "X.232"
[235] "X.233" "X.234"
[237] "X.235" "X.236"
[239] "X.237" "X.238"
[241] "X.239" "X.240"
[243] "X.241" "X.242"
[245] "X.243" "X.244"
[247] "X.245" "X.246"
[249] "X.247" "X.248"
[251] "X.249" "X.250"
[253] "X.251" "X.252"
[255] "X.253" "X.254"
COVID %>%
head()
CountryData
CountryData %>%
nrow()
[1] 256
CountryData %>%
names()
[1] "country" "area" "pop" "growth"
[5] "birth" "death" "migr" "maternal"
[9] "infant" "life" "fert" "health"
[13] "HIVrate" "HIVpeople" "HIVdeath" "obesity"
[17] "underweight" "educ" "unemploymentYouth" "GDP"
[21] "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax"
[29] "budget" "debt" "inflation" "discount"
[33] "lending" "narrow" "broad" "credit"
[37] "shares" "balance" "exports" "imports"
[41] "gold" "externalDebt" "homeStock" "abroadStock"
[45] "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro"
[53] "elecRenew" "oilProd" "oilExp" "oilImp"
[57] "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp"
[65] "gasImp" "gasRes" "mainlines" "cell"
[69] "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
CountryData %>%
head()
countryRegions
countryRegions %>%
nrow()
[1] 254
countryRegions %>%
names()
[1] "ISO3" "ADMIN" "REGION" "continent" "GEO3major"
[6] "GEO3" "IMAGE24" "GLOCAF" "Stern" "SRESmajor"
[11] "SRES" "GBD" "AVOIDnumeric" "AVOIDname" "LDC"
[16] "SID" "LLDC"
rr countryRegions %>% head()
rr COVID
Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.
rr TidyCOVID <- COVID %>% rename(country = total.covid.cases.deaths.per.million ) %>% rename( Code = X ) %>% rename(date = X.1 ) %>% rename(casesPerMillion = X.3) %>% filter(row_number() > 1) %>% subset(select = c(1,2,3,5)) %>% mutate( country = as.character(country) ) %>% mutate(date = mdy(date)) %>% mutate(casesPerMillion = as.integer(casesPerMillion) - 1)
rr TidyCOVID
Ezch instance in TidyCOVID represents a different day in a country’s progression through COVID-19. It provides the country code, the date, and the total cases per million up to that date.
We will extract the ISO3 country code and continent from the countryRegions data. Since naming conventions of countries is variate, the ISO3 country code allows us a standardized demarcation of country with which to join with other data tables.
rr Labels <- countryRegions %>% subset(select = c(3, )) %>% rename(continent = REGION) Labels
We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people).
rr RelevantCountryData <- CountryData %>% subset(select = c(1,2,3)) %>% mutate(popdensity = pop/area) RelevantCountryData
Calculate the number of cases in each country by multiplying casesPerMillion by the country’s population (in millions).
rr COVIDGrowth <- inner_join(TidyCOVID, RelevantCountryData, by = c()) %>% mutate( = (casesPerMillion * round(pop/1000000, digits = 0))) COVIDGrowth <- COVIDGrowth %>% left_join(Labels, by = c( = 3))
Column `Code`/`ISO3` joining factor and character vector, coercing into character vector
rr COVIDGrowth
This table records the first date that a country recorded a nonzero number of COVID-19 cases. This datagraph will help us visualize when countries first became infected.
rr FirstInstance <- COVIDGrowth %>% filter(cases != 0) %>% group_by(country, continent) %>% summarise(beginningofspread = min(date))
FirstInstance
This table averages the number of case increase per day from the first day a country had COVID-19 to the most recent in the data table (April 5 2020)
rr DailySpread <- left_join(COVIDGrowth, FirstInstance, by = c()) %>% filter(date == \2020-04-05) %>% mutate(dayselapsed = date - beginningofspread) %>% mutate(dailyspread = cases / as.numeric(dayselapsed) ) %>% mutate(dailyspreadpermillion = casesPerMillion / as.numeric(dayselapsed) ) %>% subset(select = c(, , , )) DailySpread\(dailyspread[is.na(DailySpread\)dailyspread)] <- 0 DailySpread\(dailyspreadpermillion[is.na(DailySpread\)dailyspreadpermillion)] <- 0 DailySpread
rr COVIDFinal <- left_join(COVIDGrowth, DailySpread, by = c())
rr COVIDFinal
rr COVIDFinal %>% group_by(date) %>% summarise(totalcases = sum(cases)) %>% ggplot(aes(x = date, y = totalcases)) + geom_point() + xlab() + ylab(-19 Cases)
This graph shows the exponential growth trend that we already knew existed with COVID-19. This confirms that our data is valid in that it accurately represents the trends that have been described by researchers and scientists in the media.
rr na.omit(COVIDFinal) %>% group_by(date, continent) %>% summarise(totalcases = sum(cases)) %>% ggplot(aes(x = date, y = totalcases)) + geom_point() + facet_wrap(~continent) + xlab() + ylab(-19 Cases)
This graph shows the growth in cases by continent. The trend is much stronger in the origin continent of Asia, but also shows strength growing in Europe, Africa, North America, and South America.
rr na.omit(FirstInstance) %>% ggplot(aes(x = beginningofspread, fill = continent)) + geom_dotplot(stackgroups = TRUE, binwidth = 1, binpositions=) + xlab(’s First Case of COVID-19) + theme(panel.background = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())
This graph shows the progression of the COVID-19 spread across continents. Asia was obviously the first country to have cases, but North America and Europe quickyl followed. South America and Africa both joined in late February, whereas Australia was able to isolate themselves until mid to late March.
rr
COVIDFinal %>% group_by(country) %>% summarise(dailyspread = mean(dailyspread)) %>% arrange(desc(dailyspread)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(dailyspread)), y= dailyspread)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab(Number Infected Per Day) + theme(axis.title.x = element_blank())
According to this graph, the countries with the top four infection rates are China, India, Indonesia, and the United States, closely followed by Brazil.
rr COVIDFinal %>% group_by(country) %>% summarise(pop = mean(pop)) %>% arrange(desc(pop)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(pop)), y= pop)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab() + theme(axis.title.x = element_blank())
We see a similar trend here, in that the top four (in slightly different order) consist of China, India, Indonesia, the United States, and closely trailing Brazil.
rr na.omit(COVIDFinal) %>% ggplot(aes(x = pop, y = dailyspread, color = continent)) + geom_point() + xlab(of Country) + ylab(Number Infected Per Day)
Here, we can see again the trend that was represented in the two previous graphs, but now they are all in the same data frame. While most of the rest of the world trails with under 10,000 infected per day, the 5 countries with the highest popualtion in the data set have over 15,000 - up to over 50,000- and are far separated from the rest of the pack. Population, while not a direct factor contributing to the level of development of a country, is a decent indicator of the rate of infection.
rr na.omit(COVIDFinal) %>% ggplot(aes(x = pop, y = dailyspread, color = continent)) + geom_point() + xlim(0,500000000) + ylim(0, 40000) + xlab(of Country) + ylab(Number Infected Per Day) + stat_smooth(method = lm)
Removing the largest outliers of China and India, the poitive relationship still holds. Regardless of whether they are involved, the trends are very clearly upward sloping.
rr
COVIDFinal %>% group_by(country) %>% summarise(dailyspreadpermillion = mean(dailyspreadpermillion)) %>% arrange(desc(dailyspreadpermillion)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(dailyspreadpermillion)), y= dailyspreadpermillion)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab(Per Million Infected Per Day) + theme(axis.title.x = element_blank())
According the this graph, those with the highest infection rates with the confounding factor of population removed (since it is population per million, it has been adjusted to put all countries on one scale) are Guinea-Bissau, Botswana, Eritrea, El Salvador, and Puerto Rico.
rr
COVIDFinal %>% group_by(country) %>% summarise(popdensity = mean(popdensity)) %>% arrange(desc(popdensity)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(popdensity)), y= popdensity)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab(Density (people/sq km)) + theme(axis.title.x = element_blank())
According the this graph, those with the highest population density (number of people per square kilometer) are Monaco, Singapore, and Gibraltar.
rr na.omit(COVIDFinal) %>% ggplot(aes(x = popdensity, y = dailyspreadpermillion)) + geom_point()
On this graph, it is a little chalenging to see the relationship between population density and the daily spread per million. For the sake of this analysis, we will further divide the information.
rr na.omit(COVIDFinal) %>% ggplot(aes(x = popdensity, y = dailyspreadpermillion)) + geom_point() + facet_wrap(~continent) + xlim(0,1500)
Again, it is hard to see a perfect correlaiton here. In general, all the graphs show little t no correlation between the population density and the daily spread per million.
rr WideCountries <- COVIDFinal %>% subset(select = c(,
)) %>% spread(key = date, value = cases) WideCountries[is.na(WideCountries)] <- 0 WideCountries
rr compareCOVID <- function(countryA, countryB) {
A <-
WideCountries %>%
filter(country == countryA)
B <- WideCountries %>% filter(country == countryB) A <- A %>% gather(key = date, value = count) %>% filter(row_number() > 1) %>% mutate(date = lubridate::ymd(date)) %>% mutate(count = as.numeric(count)) %>% mutate(country = countryA)
B <- B %>% gather(key = date, value = count) %>% filter(row_number() > 1) %>% mutate(date = lubridate::ymd(date))%>% mutate(count = as.numeric(count)) %>% mutate(country = countryB)
GG <- rbind(A,B)
return( ggplot(GG, aes(x = date, y = count, color = country)) + stat_smooth(formula = y ~ x, method = ) + ylab(of COVID-19 Cases) + xlab())
}
This function willl allow us to create a direct comparison between two countries based on the date and the number of COVID-19 cases in each country. This way, we can compare the spread between nations and identify similar trends in different parts of the world.
rr compareCOVID(, States)
rr compareCOVID(, )
rr compareCOVID(Rico, )
Ultimately, COVID-19 still remains a mystery to us. We were able to find a positive correlation between the population of a country and its spread. This indicates that the more people a country has, the more likely the infection is to affect a large portion of the people. However, we struggled to find the correlation that we were looking for between the population density and the infection rate per million. This indicated that the correlation is far less evident once standardized and confounding factors are removed.
In the future when more updated information about COVID-19 is released, some of these trends may be more relevant or easy to define. However, with the little that we know, it was not possible to make this conclusion with confidence.